{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "####  PCA降维\n",
    "Otto_tfidf数据集"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "导入包和模块"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Program Files\\Anaconda\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 216, got 192\n",
      "  return f(*args, **kwds)\n",
      "D:\\Program Files\\Anaconda\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 216, got 192\n",
      "  return f(*args, **kwds)\n",
      "D:\\Program Files\\Anaconda\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n",
      "  return f(*args, **kwds)\n",
      "D:\\Program Files\\Anaconda\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n",
      "  return f(*args, **kwds)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.decomposition import PCA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "准备数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>feat_1_tfidf</th>\n",
       "      <th>feat_2_tfidf</th>\n",
       "      <th>feat_3_tfidf</th>\n",
       "      <th>feat_4_tfidf</th>\n",
       "      <th>feat_5_tfidf</th>\n",
       "      <th>feat_6_tfidf</th>\n",
       "      <th>feat_7_tfidf</th>\n",
       "      <th>feat_8_tfidf</th>\n",
       "      <th>feat_9_tfidf</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_85_tfidf</th>\n",
       "      <th>feat_86_tfidf</th>\n",
       "      <th>feat_87_tfidf</th>\n",
       "      <th>feat_88_tfidf</th>\n",
       "      <th>feat_89_tfidf</th>\n",
       "      <th>feat_90_tfidf</th>\n",
       "      <th>feat_91_tfidf</th>\n",
       "      <th>feat_92_tfidf</th>\n",
       "      <th>feat_93_tfidf</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0.081393</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.075886</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.231403</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.199730</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>0.011987</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.011668</td>\n",
       "      <td>0.105971</td>\n",
       "      <td>0.021681</td>\n",
       "      <td>0.080435</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.008244</td>\n",
       "      <td>0.022456</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.124622</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.145988</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 95 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  feat_1_tfidf  feat_2_tfidf  feat_3_tfidf  feat_4_tfidf  feat_5_tfidf  \\\n",
       "0   1      0.081393           0.0           0.0      0.000000      0.000000   \n",
       "1   2      0.000000           0.0           0.0      0.000000      0.000000   \n",
       "2   3      0.000000           0.0           0.0      0.000000      0.000000   \n",
       "3   4      0.011987           0.0           0.0      0.011668      0.105971   \n",
       "4   5      0.000000           0.0           0.0      0.000000      0.000000   \n",
       "\n",
       "   feat_6_tfidf  feat_7_tfidf  feat_8_tfidf  feat_9_tfidf   ...     \\\n",
       "0      0.000000      0.000000      0.000000           0.0   ...      \n",
       "1      0.000000      0.000000      0.231403           0.0   ...      \n",
       "2      0.000000      0.000000      0.199730           0.0   ...      \n",
       "3      0.021681      0.080435      0.000000           0.0   ...      \n",
       "4      0.000000      0.000000      0.000000           0.0   ...      \n",
       "\n",
       "   feat_85_tfidf  feat_86_tfidf  feat_87_tfidf  feat_88_tfidf  feat_89_tfidf  \\\n",
       "0       0.075886       0.000000       0.000000            0.0            0.0   \n",
       "1       0.000000       0.000000       0.000000            0.0            0.0   \n",
       "2       0.000000       0.000000       0.000000            0.0            0.0   \n",
       "3       0.000000       0.008244       0.022456            0.0            0.0   \n",
       "4       0.124622       0.000000       0.000000            0.0            0.0   \n",
       "\n",
       "   feat_90_tfidf  feat_91_tfidf  feat_92_tfidf  feat_93_tfidf   target  \n",
       "0       0.000000            0.0            0.0            0.0  Class_1  \n",
       "1       0.000000            0.0            0.0            0.0  Class_1  \n",
       "2       0.000000            0.0            0.0            0.0  Class_1  \n",
       "3       0.000000            0.0            0.0            0.0  Class_1  \n",
       "4       0.145988            0.0            0.0            0.0  Class_1  \n",
       "\n",
       "[5 rows x 95 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dpath = \"./data/\"\n",
    "train = pd.read_csv(dpath + \"Otto_FE_train_tfidf.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(61878, 95)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = train.iloc[:,1:94]\n",
    "y_train = train[\"target\"]\n",
    "train_id = train[\"id\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "PCA降维"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "pca = PCA(n_components=0.85)\n",
    "pca.fit(X_train)\n",
    "X_train_pca = pca.transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(61878, 48)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_pca.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEA9JREFUeJzt3H+s3fVdx/Hny3bA3CJsUM1swXahmnUZTnfXLXHqMpQVmesSIZb5oyaYajLMjJuzGMe2OhMwZt0SMbEZaGVqIcwfN1IlBOaPLAvrZbgfHZLdIY5ryShrx8QFurK3f5wv2fFwy/3ee8/t5d7P85E0fL+f7+d7zucTcl+fz/mc7/mkqpAkteG7lrsBkqTTx9CXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNWTtcjdg1HnnnVcbN25c7mZI0opy7733PlZV6+aq97wL/Y0bNzI1NbXczZCkFSXJf/Wp5/KOJDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1pNcvcpNsAz4CrAE+WlXXjVz/CeDDwEXAjqq6bejaTuD3utMPVtX+cTT8VDbuvv1ZZQ9dd9lSvqUkrRhzzvSTrAFuAC4FtgBXJtkyUu0rwK8AfzVy70uB9wGvA7YC70vyksU3W5K0EH2Wd7YC01X1YFWdAA4A24crVNVDVfU54Nsj974ZuLOqjlXVceBOYNsY2i1JWoA+ob8eeHjofKYr62Mx90qSxqxP6GeWsur5+r3uTbIryVSSqaNHj/Z8aUnSfPUJ/Rng/KHzDcCRnq/f696q2ldVE1U1sW7dnNtBS5IWqE/oHwI2J9mU5AxgBzDZ8/XvAC5J8pLuC9xLujJJ0jKYM/Sr6iRwNYOwvh+4taoOJ9mT5K0ASV6bZAa4AvjTJIe7e48Bv89g4DgE7OnKJEnLoNdz+lV1EDg4Unbt0PEhBks3s917E3DTItooSRoTf5ErSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1Jakiv0E+yLckDSaaT7J7l+plJbumu35NkY1f+giT7k3w+yf1Jrhlv8yVJ8zFn6CdZA9wAXApsAa5MsmWk2lXA8aq6ENgLXN+VXwGcWVWvAl4D/NozA4Ik6fTrM9PfCkxX1YNVdQI4AGwfqbMd2N8d3wZcnCRAAS9KshZ4IXAC+MZYWi5Jmrc+ob8eeHjofKYrm7VOVZ0EHgfOZTAA/C/wCPAV4I+q6tjoGyTZlWQqydTRo0fn3QlJUj99Qj+zlFXPOluBp4HvBzYB70ry8mdVrNpXVRNVNbFu3boeTZIkLUSf0J8Bzh863wAcOVWdbinnbOAY8Hbgn6rqW1X1KPBJYGKxjZYkLUyf0D8EbE6yKckZwA5gcqTOJLCzO74cuLuqisGSzpsy8CLg9cB/jKfpkqT5mjP0uzX6q4E7gPuBW6vqcJI9Sd7aVbsRODfJNPBbwDOPdd4AvBj4AoPB48+q6nNj7oMkqae1fSpV1UHg4EjZtUPHTzJ4PHP0vidmK5ckLQ9/kStJDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDWk1947q8HG3bc/q+yh6y5bhpZI0vJxpi9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1pFfoJ9mW5IEk00l2z3L9zCS3dNfvSbJx6NpFST6V5HCSzyc5a3zNlyTNx5yhn2QNcANwKbAFuDLJlpFqVwHHq+pCYC9wfXfvWuBjwK9X1SuBNwLfGlvrJUnz0memvxWYrqoHq+oEcADYPlJnO7C/O74NuDhJgEuAz1XVZwGq6mtV9fR4mi5Jmq8+ob8eeHjofKYrm7VOVZ0EHgfOBX4QqCR3JPlMkvcsvsmSpIVa26NOZimrnnXWAm8AXgt8E7gryb1Vddf/uznZBewCuOCCC3o0SZK0EH1m+jPA+UPnG4Ajp6rTreOfDRzryv+lqh6rqm8CB4EfHX2DqtpXVRNVNbFu3br590KS1Euf0D8EbE6yKckZwA5gcqTOJLCzO74cuLuqCrgDuCjJd3eDwU8CXxxP0yVJ8zXn8k5VnUxyNYMAXwPcVFWHk+wBpqpqErgRuDnJNIMZ/o7u3uNJPsRg4CjgYFXdvkR9kSTNoc+aPlV1kMHSzHDZtUPHTwJXnOLejzF4bFOStMz8Ra4kNcTQl6SG9FreWc027n72VwwPXXfZMrREkpaeM31JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDWl+G4bn4hYNklYbZ/qS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNcZfNBXD3TUkrlTN9SWqIoS9JDTH0Jakhhr4kNaRX6CfZluSBJNNJds9y/cwkt3TX70myceT6BUmeSPLu8TRbkrQQc4Z+kjXADcClwBbgyiRbRqpdBRyvqguBvcD1I9f3Av+4+OZKkhajz0x/KzBdVQ9W1QngALB9pM52YH93fBtwcZIAJHkb8CBweDxNliQtVJ/QXw88PHQ+05XNWqeqTgKPA+cmeRHwO8AHFt9USdJi9Qn9zFJWPet8ANhbVU885xsku5JMJZk6evRojyZJkhaizy9yZ4Dzh843AEdOUWcmyVrgbOAY8Drg8iR/CJwDfDvJk1X1x8M3V9U+YB/AxMTE6IAiSRqTPqF/CNicZBPw38AO4O0jdSaBncCngMuBu6uqgB9/pkKS9wNPjAa+JOn0mTP0q+pkkquBO4A1wE1VdTjJHmCqqiaBG4Gbk0wzmOHvWMpGS5IWpteGa1V1EDg4Unbt0PGTwBVzvMb7F9A+SdIY+YtcSWqIoS9JDXE//TFyn31Jz3fO9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGuLTO6eJT/ZIej5wpi9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiD/OWmb+aEvS6eRMX5Ia4kz/eWq2TwDgpwBJi+NMX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kN6bXhWpJtwEeANcBHq+q6ketnAn8BvAb4GvDzVfVQkp8GrgPOAE4Av11Vd4+x/U1yO2ZJCzXnTD/JGuAG4FJgC3Blki0j1a4CjlfVhcBe4Pqu/DHgZ6vqVcBO4OZxNVySNH99lne2AtNV9WBVnQAOANtH6mwH9nfHtwEXJ0lV3VdVR7ryw8BZ3acCSdIy6BP664GHh85nurJZ61TVSeBx4NyROj8H3FdVT42+QZJdSaaSTB09erRv2yVJ89RnTT+zlNV86iR5JYMln0tme4Oq2gfsA5iYmBh9bfXkWr+kufQJ/Rng/KHzDcCRU9SZSbIWOBs4BpBkA/C3wC9X1ZcX3WItiAOCJOi3vHMI2JxkU5IzgB3A5EidSQZf1AJcDtxdVZXkHOB24Jqq+uS4Gi1JWpg5Z/pVdTLJ1cAdDB7ZvKmqDifZA0xV1SRwI3BzkmkGM/wd3e1XAxcC703y3q7skqp6dNwd0cL4CUBqS6/n9KvqIHBwpOzaoeMngStmue+DwAcX2UZJ0pj0Cn21x08A0urkNgyS1BBDX5Ia4vKO5s2lH2nlcqYvSQ0x9CWpIS7vaGxc9pGe/5zpS1JDnOlryc32CQD8FCAtB2f6ktQQZ/paVn4PIJ1ehr6el55rMHCgkBbO0Neq4WAgzc3Q16rnpwbpOwx9aRanGgwcJLTSGfrSmDggaCUw9KUl5mCg5xNDX1omz/WjNQcKLRVDX1pBFvKltAOIhhn6UqPGOYA4sKwchr6kJbWQgcJBZOkY+pJWDAeQxTP0JTWp1QHE0JekRZrPAPJc107HAOLWypLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNaRX6CfZluSBJNNJds9y/cwkt3TX70mycejaNV35A0nePL6mS5Lma87QT7IGuAG4FNgCXJlky0i1q4DjVXUhsBe4vrt3C7ADeCWwDfiT7vUkScugz0x/KzBdVQ9W1QngALB9pM52YH93fBtwcZJ05Qeq6qmq+k9guns9SdIy6BP664GHh85nurJZ61TVSeBx4Nye90qSTpNU1XNXSK4A3lxVv9qd/xKwtap+Y6jO4a7OTHf+ZQYz+j3Ap6rqY135jcDBqvr4yHvsAnZ1pz8EPDCGvp0HPDaG11mpWu5/y30H+99q/3+gqtbNVanPhmszwPlD5xuAI6eoM5NkLXA2cKznvVTVPmBfj7b0lmSqqibG+ZorScv9b7nvYP9b7/9c+izvHAI2J9mU5AwGX8xOjtSZBHZ2x5cDd9fgI8QksKN7umcTsBn49HiaLkmarzln+lV1MsnVwB3AGuCmqjqcZA8wVVWTwI3AzUmmGczwd3T3Hk5yK/BF4CTwjqp6eon6Ikmaw5xr+itVkl3dslGTWu5/y30H+996/+eyakNfkvRsbsMgSQ1ZlaE/17YRq02Sm5I8muQLQ2UvTXJnki91/33JcrZxqSQ5P8knktyf5HCSd3blrfT/rCSfTvLZrv8f6Mo3dVuifKnbIuWM5W7rUkmyJsl9Sf6hO2+m7wux6kK/57YRq82fM9jmYthu4K6q2gzc1Z2vRieBd1XVK4DXA+/o/n+30v+ngDdV1Q8Drwa2JXk9g61Q9nb9P85gq5TV6p3A/UPnLfV93lZd6NNv24hVpar+lcFTU8OGt8bYD7zttDbqNKmqR6rqM93x/zD4419PO/2vqnqiO31B96+ANzHYEgVWcf+TbAAuAz7anYdG+r5QqzH03fph4Puq6hEYBCPwvcvcniXX7e76I8A9NNT/bnnj34FHgTuBLwNf77ZEgdX9N/Bh4D3At7vzc2mn7wuyGkM/s5T5iNIql+TFwMeB36yqbyx3e06nqnq6ql7N4BfvW4FXzFbt9LZq6SV5C/BoVd07XDxL1VXX98Xosw3DStNr64cGfDXJy6rqkSQvYzALXJWSvIBB4P9lVf1NV9xM/59RVV9P8s8Mvts4J8nabsa7Wv8Gfgx4a5KfAc4CvofBzL+Fvi/Yapzp99k2ogXDW2PsBP5+GduyZLo13BuB+6vqQ0OXWun/uiTndMcvBH6Kwfcan2CwJQqs0v5X1TVVtaGqNjL4O7+7qn6BBvq+GKvyx1ndyP9hvrNtxB8sc5OWVJK/Bt7IYHfBrwLvA/4OuBW4APgKcEVVjX7Zu+IleQPwb8Dn+c667u8yWNdvof8XMfiycg2DSdytVbUnycsZPMTwUuA+4Ber6qnla+nSSvJG4N1V9ZbW+j5fqzL0JUmzW43LO5KkUzD0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyP8Bst/13MP2gdUAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "plt.bar(range(len(pca.explained_variance_ratio_)), pca.explained_variance_ratio_)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 问答题1和2解答：  \n",
    "从降维后的方差分布图看，第一主成分解释了较多的方差，之后的主成分解释的方差逐渐减少"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "保存结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_components = pca.n_components_\n",
    "feat_names_pca = []\n",
    "for i in range(n_components):\n",
    "    feat_names_pca.append(\"pca\"+str(i))\n",
    "y = pd.Series(y_train,name = \"target\")\n",
    "train_pca = pd.concat([train_id,pd.DataFrame(columns=feat_names_pca,data=X_train_pca),y],axis=1)\n",
    "train_pca.to_csv(dpath + \"Otto_train_tfidf_pca.csv\",index=False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "pickle.dump(pca,open(\"pca.pkl\",\"wb\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
